Uses the 'used car prices' dataset to train a SageMaker XGBoost model capable of predicting the MSRP (Manufacturer's Suggested Retail Price). Deploys an unoptimised model and collects Key Performance Indicators (KPIs). Utilises SageMaker Hyperparameter Tuning Jobs to perform hyperparameter optimisation. Deploys the optimised model and collects Key Performance Indicators (KPIs).
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import sagemaker
import boto3
from sagemaker import Session
import os
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.analytics import HyperparameterTuningJobAnalytics
Requirement already satisfied: wordcloud in /opt/conda/lib/python3.7/site-packages (1.9.2) Requirement already satisfied: numpy>=1.6.1 in /opt/conda/lib/python3.7/site-packages (from wordcloud) (1.21.6) Requirement already satisfied: pillow in /opt/conda/lib/python3.7/site-packages (from wordcloud) (9.5.0) Requirement already satisfied: matplotlib in /opt/conda/lib/python3.7/site-packages (from wordcloud) (3.1.3) Requirement already satisfied: cycler>=0.10 in /opt/conda/lib/python3.7/site-packages (from matplotlib->wordcloud) (0.10.0) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib->wordcloud) (1.1.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib->wordcloud) (2.4.6) Requirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.7/site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.14.0) Requirement already satisfied: setuptools in /opt/conda/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib->wordcloud) (65.5.1) WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
# load the used-car pricing dataset from disk
car_df = pd.read_csv(filepath_or_buffer='data/used_car_price.csv')
car_df
| Make | Model | Type | Origin | DriveTrain | MSRP | EngineSize | Cylinders | Horsepower | MPG_City | MPG_Highway | Weight | Wheelbase | Length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Acura | MDX | SUV | Asia | All | 36945 | 3.5 | 6 | 265 | 17 | 23 | 4451 | 106 | 189 |
| 1 | Acura | RSX Type S 2dr | Sedan | Asia | Front | 23820 | 2.0 | 4 | 200 | 24 | 31 | 2778 | 101 | 172 |
| 2 | Acura | TSX 4dr | Sedan | Asia | Front | 26990 | 2.4 | 4 | 200 | 22 | 29 | 3230 | 105 | 183 |
| 3 | Acura | TL 4dr | Sedan | Asia | Front | 33195 | 3.2 | 6 | 270 | 20 | 28 | 3575 | 108 | 186 |
| 4 | Acura | 3.5 RL 4dr | Sedan | Asia | Front | 43755 | 3.5 | 6 | 225 | 18 | 24 | 3880 | 115 | 197 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 423 | Volvo | C70 LPT convertible 2dr | Sedan | Europe | Front | 40565 | 2.4 | 5 | 197 | 21 | 28 | 3450 | 105 | 186 |
| 424 | Volvo | C70 HPT convertible 2dr | Sedan | Europe | Front | 42565 | 2.3 | 5 | 242 | 20 | 26 | 3450 | 105 | 186 |
| 425 | Volvo | S80 T6 4dr | Sedan | Europe | Front | 45210 | 2.9 | 6 | 268 | 19 | 26 | 3653 | 110 | 190 |
| 426 | Volvo | V40 | Wagon | Europe | Front | 26135 | 1.9 | 4 | 170 | 22 | 29 | 2822 | 101 | 180 |
| 427 | Volvo | XC70 | Wagon | Europe | All | 35145 | 2.5 | 5 | 208 | 20 | 27 | 3823 | 109 | 186 |
428 rows × 14 columns
# preview the first 5 rows to sanity-check column names and value formats
car_df.head(5)
| Make | Model | Type | Origin | DriveTrain | MSRP | EngineSize | Cylinders | Horsepower | MPG_City | MPG_Highway | Weight | Wheelbase | Length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Acura | MDX | SUV | Asia | All | 36945 | 3.5 | 6 | 265 | 17 | 23 | 4451 | 106 | 189 |
| 1 | Acura | RSX Type S 2dr | Sedan | Asia | Front | 23820 | 2.0 | 4 | 200 | 24 | 31 | 2778 | 101 | 172 |
| 2 | Acura | TSX 4dr | Sedan | Asia | Front | 26990 | 2.4 | 4 | 200 | 22 | 29 | 3230 | 105 | 183 |
| 3 | Acura | TL 4dr | Sedan | Asia | Front | 33195 | 3.2 | 6 | 270 | 20 | 28 | 3575 | 108 | 186 |
| 4 | Acura | 3.5 RL 4dr | Sedan | Asia | Front | 43755 | 3.5 | 6 | 225 | 18 | 24 | 3880 | 115 | 197 |
# preview the last 5 rows to confirm the file loaded completely (428 rows)
car_df.tail(5)
| Make | Model | Type | Origin | DriveTrain | MSRP | EngineSize | Cylinders | Horsepower | MPG_City | MPG_Highway | Weight | Wheelbase | Length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 423 | Volvo | C70 LPT convertible 2dr | Sedan | Europe | Front | 40565 | 2.4 | 5 | 197 | 21 | 28 | 3450 | 105 | 186 |
| 424 | Volvo | C70 HPT convertible 2dr | Sedan | Europe | Front | 42565 | 2.3 | 5 | 242 | 20 | 26 | 3450 | 105 | 186 |
| 425 | Volvo | S80 T6 4dr | Sedan | Europe | Front | 45210 | 2.9 | 6 | 268 | 19 | 26 | 3653 | 110 | 190 |
| 426 | Volvo | V40 | Wagon | Europe | Front | 26135 | 1.9 | 4 | 170 | 22 | 29 | 2822 | 101 | 180 |
| 427 | Volvo | XC70 | Wagon | Europe | All | 35145 | 2.5 | 5 | 208 | 20 | 27 | 3823 | 109 | 186 |
# descriptive statistics (count/mean/std/quartiles) for the numeric columns
car_df.describe()
| MSRP | EngineSize | Cylinders | Horsepower | MPG_City | MPG_Highway | Weight | Wheelbase | Length | |
|---|---|---|---|---|---|---|---|---|---|
| count | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 | 428.000000 |
| mean | 32774.855140 | 3.196729 | 5.799065 | 215.885514 | 20.060748 | 26.843458 | 3577.953271 | 108.154206 | 186.362150 |
| std | 19431.716674 | 1.108595 | 1.559679 | 71.836032 | 5.238218 | 5.741201 | 758.983215 | 8.311813 | 14.357991 |
| min | 10280.000000 | 1.300000 | 3.000000 | 73.000000 | 10.000000 | 12.000000 | 1850.000000 | 89.000000 | 143.000000 |
| 25% | 20334.250000 | 2.375000 | 4.000000 | 165.000000 | 17.000000 | 24.000000 | 3104.000000 | 103.000000 | 178.000000 |
| 50% | 27635.000000 | 3.000000 | 6.000000 | 210.000000 | 19.000000 | 26.000000 | 3474.500000 | 107.000000 | 187.000000 |
| 75% | 39205.000000 | 3.900000 | 6.000000 | 255.000000 | 21.250000 | 29.000000 | 3977.750000 | 112.000000 | 194.000000 |
| max | 192465.000000 | 8.300000 | 12.000000 | 500.000000 | 60.000000 | 66.000000 | 7190.000000 | 144.000000 | 238.000000 |
# per-column dtype and non-null counts — confirms there are no missing values
car_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 428 entries, 0 to 427 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Make 428 non-null object 1 Model 428 non-null object 2 Type 428 non-null object 3 Origin 428 non-null object 4 DriveTrain 428 non-null object 5 MSRP 428 non-null int64 6 EngineSize 428 non-null float64 7 Cylinders 428 non-null int64 8 Horsepower 428 non-null int64 9 MPG_City 428 non-null int64 10 MPG_Highway 428 non-null int64 11 Weight 428 non-null int64 12 Wheelbase 428 non-null int64 13 Length 428 non-null int64 dtypes: float64(1), int64(8), object(5) memory usage: 46.9+ KB
# count of unique entries per column — useful for spotting high-cardinality
# categoricals (Model has 425 unique values out of 428 rows)
car_df.nunique()
Make 38 Model 425 Type 6 Origin 3 DriveTrain 3 MSRP 410 EngineSize 43 Cylinders 7 Horsepower 110 MPG_City 28 MPG_Highway 33 Weight 348 Wheelbase 40 Length 67 dtype: int64
# pairplot across every column of the raw dataframe
sns.pairplot(car_df);
# keep only the numeric columns (plus the MSRP target) for correlation work
X_numerical = car_df[['EngineSize', 'Cylinders', 'Horsepower', 'MPG_City',
                      'MPG_Highway', 'Weight', 'Wheelbase', 'Length', 'MSRP']]
# pairplot restricted to the numeric features
sns.pairplot(X_numerical);
# correlation heatmap for the numeric features
plt.figure(figsize = (8, 6))
sns.heatmap(X_numerical.corr(), annot = True);
# count plot: number of vehicles per body type
plt.figure(figsize = (16, 8))
sns.countplot(x = 'Type', data = car_df)
plt.setp(plt.xticks()[1], rotation = 45);
# scatterplot of Horsepower against MSRP (strong positive relationship)
sns.scatterplot(data = car_df, x = 'Horsepower', y = 'MSRP')
<matplotlib.axes._subplots.AxesSubplot at 0x7fb4e5a047d0>
# word cloud built from the automobile Model strings
model_text = str(car_df.Model.values)
wc = WordCloud(background_color = "black", max_words = 2000, max_font_size = 100,
               random_state = 3, stopwords = set(STOPWORDS),
               contour_width = 3).generate(model_text)
plt.figure(figsize = (25, 15))
plt.imshow(wc, interpolation = "bilinear")
plt.axis("off")
plt.show()
# one-hot encode the categorical columns so XGBoost receives purely numeric input
categorical_cols = ["Make", "Model", "Type", "Origin", "DriveTrain"]
car_df = pd.get_dummies(car_df, columns = categorical_cols)
car_df.head(5)
| MSRP | EngineSize | Cylinders | Horsepower | MPG_City | MPG_Highway | Weight | Wheelbase | Length | Make_Acura | ... | Type_Sedan | Type_Sports | Type_Truck | Type_Wagon | Origin_Asia | Origin_Europe | Origin_USA | DriveTrain_All | DriveTrain_Front | DriveTrain_Rear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36945 | 3.5 | 6 | 265 | 17 | 23 | 4451 | 106 | 189 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 23820 | 2.0 | 4 | 200 | 24 | 31 | 2778 | 101 | 172 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 26990 | 2.4 | 4 | 200 | 22 | 29 | 3230 | 105 | 183 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 33195 | 3.2 | 6 | 270 | 20 | 28 | 3575 | 108 | 186 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 4 | 43755 | 3.5 | 6 | 225 | 18 | 24 | 3880 | 115 | 197 | 1 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
5 rows × 484 columns
# independent variables: every column except the target (MSRP), as float32
X = car_df.drop("MSRP", axis = 1).to_numpy(dtype = 'float32')
# dependent variable: the MSRP target itself, as float32
y = car_df["MSRP"].to_numpy(dtype = 'float32')
# confirm the feature-matrix shape: (428 samples, 483 features after one-hot)
X.shape
(428, 483)
# view y shape — one target value per sample
y.shape
(428,)
# split the data into train & test sets (85% of data for training).
# a fixed random_state makes the split — and therefore every downstream KPI —
# reproducible across notebook runs (the original splits were unseeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)
# split the held-out 15% in half: one part stays the test set, the other
# becomes the validation set consumed by SageMaker during training
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)
# view shape of training data independent variables
X_train.shape
(363, 483)
# view shape of test data independent variables
X_test.shape
(32, 483)
# view shape of validation data independent variables
# (the original comment mislabelled this cell as "test data")
X_val.shape
(33, 483)
# view shape of training data dependent variable
y_train.shape
(363,)
# convert the training & validation arrays into dataframes with the dependent
# variable set as the first column, because the SageMaker built-in XGBoost
# algorithm expects the label in column 0 of the CSV.
# A single concat builds each frame in one shot instead of inserting 483
# columns one at a time, which triggered pandas' fragmentation
# PerformanceWarning in the original loop-based version.
train_data = pd.concat([pd.DataFrame({'Target': y_train}), pd.DataFrame(X_train)], axis = 1)
val_data = pd.concat([pd.DataFrame({'Target': y_val}), pd.DataFrame(X_val)], axis = 1)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:6: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:10: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` # Remove the CWD from sys.path while we load stuff.
# view validation data — Target first, then the 483 numbered feature columns
val_data.head(5)
| Target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32415.0 | 3.0 | 6.0 | 215.0 | 18.0 | 24.0 | 3285.0 | 105.0 | 177.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 49995.0 | 6.0 | 8.0 | 316.0 | 10.0 | 12.0 | 6400.0 | 123.0 | 190.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 2 | 21595.0 | 3.0 | 6.0 | 155.0 | 20.0 | 27.0 | 3308.0 | 109.0 | 200.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 3 | 10539.0 | 1.6 | 4.0 | 103.0 | 29.0 | 33.0 | 2255.0 | 96.0 | 167.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 36100.0 | 5.3 | 8.0 | 295.0 | 14.0 | 18.0 | 5678.0 | 130.0 | 222.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
5 rows × 484 columns
# view train_data — same layout as val_data (label in the first column)
train_data.head(5)
| Target | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29995.0 | 2.5 | 6.0 | 192.0 | 18.0 | 26.0 | 3428.0 | 107.0 | 184.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 25045.0 | 2.0 | 4.0 | 227.0 | 20.0 | 27.0 | 3085.0 | 99.0 | 174.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 20220.0 | 2.4 | 4.0 | 150.0 | 21.0 | 28.0 | 3175.0 | 108.0 | 191.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 3 | 22035.0 | 2.7 | 6.0 | 200.0 | 21.0 | 29.0 | 3469.0 | 113.0 | 204.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 4 | 17163.0 | 2.5 | 6.0 | 165.0 | 19.0 | 22.0 | 3020.0 | 98.0 | 163.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
5 rows × 484 columns
# persist the train/validation frames as headerless, index-free CSVs —
# the exact format SageMaker's built-in XGBoost expects for CSV input
train_data.to_csv('train.csv', index = False, header = False)
val_data.to_csv('validation.csv', index = False, header = False)
# create a single SageMaker session and reuse it for the default-bucket
# lookup (the original code instantiated a second, throwaway Session()
# just to call default_bucket())
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = 'XGBoost-Regressor'   # S3 key prefix for all artifacts of this experiment
key = 'XGBoost-Regressor'      # object name used for the uploaded CSVs
# IAM role granting SageMaker access to the data for training and hosting
role = sagemaker.get_execution_role()
# push train.csv into the bucket under <prefix>/train/<key>
with open('train.csv', 'rb') as fh:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(fh)
s3_train_data = f's3://{bucket}/{prefix}/train/{key}'
print(f'uploaded training data location: {s3_train_data}')
uploaded training data location: s3://sagemaker-us-east-1-628298263648/XGBoost-Regressor/train/XGBoost-Regressor
# push validation.csv into the bucket under <prefix>/validation/<key>
# (the original comment mislabelled this as the training upload)
with open('validation.csv', 'rb') as fh:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(fh)
s3_validation_data = f's3://{bucket}/{prefix}/validation/{key}'
print(f'uploaded validation data location: {s3_validation_data}')
uploaded validation data location: s3://sagemaker-us-east-1-628298263648/XGBoost-Regressor/validation/XGBoost-Regressor
# S3 location where SageMaker will write the trained model artifacts
output_location = f's3://{bucket}/{prefix}/output'
print(f'training artifacts will be uploaded to: {output_location}')
training artifacts will be uploaded to: s3://sagemaker-us-east-1-628298263648/XGBoost-Regressor/output
# resolve the AWS-managed XGBoost container image URI for the current region
region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve("xgboost", region, version = '1.0-1')
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3 INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
# configure the baseline (unoptimised) SageMaker XGBoost estimator
# on a single CPU instance
regressor = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    output_path = output_location,
    sagemaker_session = sagemaker_session,
)
# baseline hyperparameters (deliberately shallow trees / large eta — the
# tuning job later searches for better values)
regressor.set_hyperparameters(
    objective = 'reg:squarederror',
    max_depth = 2,
    colsample_bytree = 0.3,
    alpha = 0.1,
    eta = 0.8,
    num_round = 100,
)
# point the train/validation channels at the uploaded CSVs
train_input = sagemaker.session.TrainingInput(s3_data = s3_train_data, s3_data_type = 'S3Prefix', content_type = 'csv')
valid_input = sagemaker.session.TrainingInput(s3_data = s3_validation_data, s3_data_type = 'S3Prefix', content_type = 'csv')
data_channels = {'train': train_input, 'validation': valid_input}
# launch the training job (blocks until completion)
regressor.fit(data_channels)
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-07-12-04-41-16-919
2023-07-12 04:41:17 Starting - Starting the training job... 2023-07-12 04:41:41 Starting - Preparing the instances for training......... 2023-07-12 04:42:50 Downloading - Downloading input data... 2023-07-12 04:43:20 Training - Downloading the training image... 2023-07-12 04:44:06 Training - Training image download completed. Training in progress...[2023-07-12 04:44:22.806 ip-10-0-126-189.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training INFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json. Returning the value itself INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed) INFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode INFO:root:Determined delimiter of CSV input is ',' INFO:root:Determined delimiter of CSV input is ',' INFO:root:Determined delimiter of CSV input is ',' INFO:root:Determined delimiter of CSV input is ',' INFO:root:Single node training. [04:44:22] 363x483 matrix with 175329 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=, [04:44:22] 33x483 matrix with 15939 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=, [2023-07-12 04:44:22.913 ip-10-0-126-189.ec2.internal:7 INFO json_config.py:91] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json. [2023-07-12 04:44:22.914 ip-10-0-126-189.ec2.internal:7 INFO hook.py:201] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries. [2023-07-12 04:44:22.915 ip-10-0-126-189.ec2.internal:7 INFO profiler_config_parser.py:102] User has disabled profiler. 
[2023-07-12 04:44:22.915 ip-10-0-126-189.ec2.internal:7 INFO hook.py:255] Saving to /opt/ml/output/tensors [2023-07-12 04:44:22.916 ip-10-0-126-189.ec2.internal:7 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist. INFO:root:Debug hook created from config INFO:root:Train matrix has 363 rows INFO:root:Validation matrix has 33 rows [04:44:22] WARNING: /workspace/src/learner.cc:328: Parameters: { num_round } might not be used. This may not be accurate due to some parameters are only used in language bindings but passed down to XGBoost core. Or some parameters are not used but slip through this verification. Please open an issue if you find above cases. [0]#011train-rmse:19775.06836#011validation-rmse:13750.17969 [2023-07-12 04:44:22.923 ip-10-0-126-189.ec2.internal:7 INFO hook.py:423] Monitoring the collections: metrics [2023-07-12 04:44:22.926 ip-10-0-126-189.ec2.internal:7 INFO hook.py:486] Hook is writing from the hook with pid: 7 [1]#011train-rmse:13419.04590#011validation-rmse:8918.25000 [2]#011train-rmse:12009.40234#011validation-rmse:6240.87793 [3]#011train-rmse:9563.39160#011validation-rmse:4253.80664 [4]#011train-rmse:8927.45215#011validation-rmse:4231.82861 [5]#011train-rmse:7974.63867#011validation-rmse:4171.27734 [6]#011train-rmse:7583.79492#011validation-rmse:4195.99219 [7]#011train-rmse:7317.80859#011validation-rmse:4293.19482 [8]#011train-rmse:6972.04150#011validation-rmse:4698.49902 [9]#011train-rmse:6766.63867#011validation-rmse:4686.06494 [10]#011train-rmse:6319.78613#011validation-rmse:4687.43018 [11]#011train-rmse:6015.90430#011validation-rmse:4685.59082 [12]#011train-rmse:5886.37256#011validation-rmse:4575.36230 [13]#011train-rmse:5757.32178#011validation-rmse:4463.19678 [14]#011train-rmse:5595.37451#011validation-rmse:4462.85205 [15]#011train-rmse:5492.44189#011validation-rmse:4462.45215 [16]#011train-rmse:5414.64209#011validation-rmse:4462.50293 
[17]#011train-rmse:5322.20068#011validation-rmse:4352.50000 [18]#011train-rmse:5136.00732#011validation-rmse:4472.11572 [19]#011train-rmse:4980.31299#011validation-rmse:4070.96045 [20]#011train-rmse:4875.25488#011validation-rmse:4049.06250 [21]#011train-rmse:4767.99707#011validation-rmse:4100.53955 [22]#011train-rmse:4681.21582#011validation-rmse:4109.30811 [23]#011train-rmse:4579.38916#011validation-rmse:4109.82520 [24]#011train-rmse:4502.97119#011validation-rmse:3915.68457 [25]#011train-rmse:4430.17627#011validation-rmse:3821.49731 [26]#011train-rmse:4348.21680#011validation-rmse:3813.33862 [27]#011train-rmse:4242.82373#011validation-rmse:3775.70923 [28]#011train-rmse:4106.54736#011validation-rmse:3751.12085 [29]#011train-rmse:4029.22046#011validation-rmse:3729.27905 [30]#011train-rmse:3967.79663#011validation-rmse:3712.57300 [31]#011train-rmse:3917.28540#011validation-rmse:3736.67847 [32]#011train-rmse:3858.30078#011validation-rmse:3776.01660 [33]#011train-rmse:3809.44116#011validation-rmse:3796.72241 [34]#011train-rmse:3772.58154#011validation-rmse:3801.50586 [35]#011train-rmse:3731.72778#011validation-rmse:3804.65991 [36]#011train-rmse:3697.03857#011validation-rmse:3802.85767 [37]#011train-rmse:3640.84155#011validation-rmse:3798.53613 [38]#011train-rmse:3607.31299#011validation-rmse:3803.26099 [39]#011train-rmse:3553.79492#011validation-rmse:3768.98096 [40]#011train-rmse:3506.58862#011validation-rmse:3822.59692 [41]#011train-rmse:3474.68555#011validation-rmse:3825.66284 [42]#011train-rmse:3439.99145#011validation-rmse:3823.68359 [43]#011train-rmse:3413.24854#011validation-rmse:3836.40845 [44]#011train-rmse:3383.10742#011validation-rmse:3839.85181 [45]#011train-rmse:3351.41650#011validation-rmse:3833.96411 [46]#011train-rmse:3323.72266#011validation-rmse:3839.06372 [47]#011train-rmse:3296.10913#011validation-rmse:3833.99976 [48]#011train-rmse:3229.77783#011validation-rmse:3983.74341 [49]#011train-rmse:3204.45337#011validation-rmse:3984.44605 
[50]#011train-rmse:3113.18262#011validation-rmse:3772.62012 [51]#011train-rmse:3084.34644#011validation-rmse:3778.83765 [52]#011train-rmse:3043.14819#011validation-rmse:3855.03076 [53]#011train-rmse:3018.33838#011validation-rmse:3861.96460 [54]#011train-rmse:2996.06250#011validation-rmse:3859.80518 [55]#011train-rmse:2923.38672#011validation-rmse:3771.47827 [56]#011train-rmse:2875.32764#011validation-rmse:3781.29224 [57]#011train-rmse:2824.13330#011validation-rmse:3792.27148 [58]#011train-rmse:2800.38403#011validation-rmse:3785.32080 [59]#011train-rmse:2781.01221#011validation-rmse:3733.85791 [60]#011train-rmse:2755.07837#011validation-rmse:3691.21924 [61]#011train-rmse:2735.11475#011validation-rmse:3700.38599 [62]#011train-rmse:2681.56860#011validation-rmse:3601.45606 [63]#011train-rmse:2657.38159#011validation-rmse:3601.05493 [64]#011train-rmse:2637.68896#011validation-rmse:3607.94580 [65]#011train-rmse:2596.87256#011validation-rmse:3601.04590 [66]#011train-rmse:2575.52441#011validation-rmse:3594.25879 [67]#011train-rmse:2550.58228#011validation-rmse:3644.05811 [68]#011train-rmse:2533.90479#011validation-rmse:3633.46436 [69]#011train-rmse:2518.07373#011validation-rmse:3649.48438 [70]#011train-rmse:2474.90454#011validation-rmse:3660.59766 [71]#011train-rmse:2452.89795#011validation-rmse:3697.63599 [72]#011train-rmse:2417.34448#011validation-rmse:3655.52734 [73]#011train-rmse:2400.92163#011validation-rmse:3654.64551 [74]#011train-rmse:2379.95117#011validation-rmse:3654.55151 [75]#011train-rmse:2361.80322#011validation-rmse:3654.56055 [76]#011train-rmse:2344.76562#011validation-rmse:3654.26416 [77]#011train-rmse:2328.04028#011validation-rmse:3648.45264 [78]#011train-rmse:2307.44775#011validation-rmse:3644.38721 [79]#011train-rmse:2291.42139#011validation-rmse:3653.07153 [80]#011train-rmse:2274.74585#011validation-rmse:3639.19385 [81]#011train-rmse:2259.63135#011validation-rmse:3639.50610 [82]#011train-rmse:2245.49414#011validation-rmse:3639.52148 
[83]#011train-rmse:2232.15601#011validation-rmse:3608.14502 [84]#011train-rmse:2218.81274#011validation-rmse:3587.85352 [85]#011train-rmse:2206.81030#011validation-rmse:3626.61816 [86]#011train-rmse:2174.26514#011validation-rmse:3601.70019 [87]#011train-rmse:2142.03076#011validation-rmse:3545.90234 [88]#011train-rmse:2127.27075#011validation-rmse:3547.81445 [89]#011train-rmse:2113.85034#011validation-rmse:3548.08691 [90]#011train-rmse:2099.89062#011validation-rmse:3548.19043 [91]#011train-rmse:2086.14453#011validation-rmse:3543.20361 [92]#011train-rmse:2072.02124#011validation-rmse:3550.29028 [93]#011train-rmse:2058.89209#011validation-rmse:3548.68164 [94]#011train-rmse:2045.71313#011validation-rmse:3555.59033 [95]#011train-rmse:2034.01318#011validation-rmse:3553.95679 [96]#011train-rmse:2021.20642#011validation-rmse:3558.53955 [97]#011train-rmse:2012.14270#011validation-rmse:3563.92920 [98]#011train-rmse:1991.74536#011validation-rmse:3599.07153 [99]#011train-rmse:1978.83618#011validation-rmse:3561.62354 2023-07-12 04:44:42 Uploading - Uploading generated training model 2023-07-12 04:44:42 Completed - Training job completed Training seconds: 112 Billable seconds: 112
# deploy the trained model to a real-time inference endpoint (one CPU instance)
predictor = regressor.deploy(initial_instance_count = 1, instance_type = "ml.m4.xlarge")
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-07-12-04-44-59-841 INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-07-12-04-44-59-841 INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-07-12-04-44-59-841
-------!
# obtain predictions for test data; the endpoint expects CSV input and
# returns a comma-separated byte string of predicted MSRPs
predictor.serializer = CSVSerializer()
predictions = predictor.predict(X_test)
predictions
b'29380.958984375,32496.431640625,13725.9501953125,23666.615234375,16972.994140625,26912.982421875,26309.08203125,26398.7109375,21943.90234375,31553.564453125,16337.021484375,16972.994140625,33116.05078125,44037.2109375,21785.392578125,13725.9501953125,97984.78125,30746.869140625,22088.958984375,38233.84375,37346.73046875,23325.234375,20753.84765625,34956.23046875,42303.8984375,16972.994140625,13610.57421875,23774.564453125,78457.2109375,25270.17578125,38336.52734375,20738.1953125'
# convert the endpoint's comma-separated byte payload to a column vector
def bytes_2_array(x):
    """Decode a CSV byte string of predictions into an (n, 1) float32 array.

    The original version sliced str(x) to strip the b'...' repr wrapper,
    which is fragile; decoding the bytes directly is both simpler and safe.
    """
    fields = x.decode('utf-8').split(',')
    return np.array(fields, dtype = 'float32').reshape(-1, 1)
# decode the CSV byte payload into a column vector of predicted MSRPs
predicted_values = bytes_2_array(predictions)
predicted_values
array([[29380.959],
[32496.432],
[13725.95 ],
[23666.615],
[16972.994],
[26912.982],
[26309.082],
[26398.71 ],
[21943.902],
[31553.564],
[16337.021],
[16972.994],
[33116.05 ],
[44037.21 ],
[21785.393],
[13725.95 ],
[97984.78 ],
[30746.87 ],
[22088.959],
[38233.844],
[37346.73 ],
[23325.234],
[20753.848],
[34956.23 ],
[42303.9 ],
[16972.994],
[13610.574],
[23774.564],
[78457.21 ],
[25270.176],
[38336.527],
[20738.195]], dtype=float32)
# collect KPI metrics by comparing predicted_values for test data to actual y_test values
k = X_test.shape[1]   # number of predictors
n = len(X_test)       # number of test samples
MSE = mean_squared_error(y_test, predicted_values)
RMSE = round(float(np.sqrt(MSE)), 3)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
# adjusted R2 is only defined for n > k + 1; with ~32 test rows and 483
# features the unguarded formula produced a meaningless value > 1
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1) if n > k + 1 else float('nan')
print('RMSE =', RMSE, '\nMSE =', MSE, '\nMAE =', MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)
RMSE = 5220.492 MSE = 27253538.0 MAE = 3267.0022 R2 = 0.9364099860293775 Adjusted R2 = 1.004361262020109
# delete the endpoint to stop incurring hosting charges
predictor.delete_endpoint()
INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2023-07-12-04-44-59-841 INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-07-12-04-44-59-841
# estimator used for hyperparameter tuning; spot instances cut training cost
regressor = sagemaker.estimator.Estimator(container,
                                          role,
                                          instance_count = 1,
                                          instance_type = 'ml.m4.xlarge',
                                          input_mode = 'File',
                                          use_spot_instances = True,
                                          max_run = 300,   # cap on training seconds
                                          max_wait = 600,  # cap on waiting for spot capacity
                                          output_path = output_location,
                                          sagemaker_session = sagemaker_session)
# 'reg:linear' is a deprecated alias of 'reg:squarederror' in XGBoost >= 0.90;
# use the canonical name, which also matches the baseline estimator above
regressor.set_hyperparameters(objective = 'reg:squarederror',
                              early_stopping_rounds = 10,
                              num_round = 150)
regressor
<sagemaker.estimator.Estimator at 0x7fb4eb14fd90>
# define the search space for the tuning job
hyperparameter_ranges = dict(
    eta = ContinuousParameter(0.3, 1),
    alpha = ContinuousParameter(0.1, 0.4),
    max_depth = IntegerParameter(8, 15),
)
hyperparameter_ranges
{'eta': <sagemaker.parameter.ContinuousParameter at 0x7fb4eb14ff50>,
'alpha': <sagemaker.parameter.ContinuousParameter at 0x7fb4eb14ff90>,
'max_depth': <sagemaker.parameter.IntegerParameter at 0x7fb4eb14f390>}
# hyperparameter tuning job: minimise validation RMSE over 5 sequential trials
tuner = HyperparameterTuner(
    regressor,
    'validation:rmse',
    hyperparameter_ranges,
    objective_type = 'Minimize',
    max_jobs = 5,
    max_parallel_jobs = 1,
)
tuner
<sagemaker.tuner.HyperparameterTuner at 0x7fb4eb133890>
# run the tuning job against the same train/validation channels
tuner.fit(data_channels)
WARNING:sagemaker.estimator:No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config WARNING:sagemaker.estimator:No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-230712-0449
.................................................................................................................................................................................................................................................................!
# pull the per-trial results of the completed tuning job into a dataframe
# (hyperparameters tried, job status, and final objective value for each trial)
job_name = tuner.latest_tuning_job.name
analytics = HyperparameterTuningJobAnalytics(sagemaker_session = sagemaker_session,
                                             hyperparameter_tuning_job_name = job_name)
summary = analytics.dataframe()
summary
| alpha | eta | max_depth | TrainingJobName | TrainingJobStatus | FinalObjectiveValue | TrainingStartTime | TrainingEndTime | TrainingElapsedTimeSeconds | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.388265 | 0.623155 | 8.0 | sagemaker-xgboost-230712-0449-005-f1dacc09 | Completed | 4786.978027 | 2023-07-12 05:08:56+00:00 | 2023-07-12 05:10:48+00:00 | 112.0 |
| 1 | 0.272014 | 0.631030 | 8.0 | sagemaker-xgboost-230712-0449-004-48d506c6 | Completed | 4534.644043 | 2023-07-12 05:04:54+00:00 | 2023-07-12 05:06:46+00:00 | 112.0 |
| 2 | 0.166181 | 0.340352 | 15.0 | sagemaker-xgboost-230712-0449-003-e1105a7d | Completed | 4374.461914 | 2023-07-12 04:58:52+00:00 | 2023-07-12 05:00:39+00:00 | 107.0 |
| 3 | 0.297940 | 0.599598 | 13.0 | sagemaker-xgboost-230712-0449-002-749d0328 | Completed | 3404.736084 | 2023-07-12 04:54:44+00:00 | 2023-07-12 04:56:36+00:00 | 112.0 |
| 4 | 0.210109 | 0.904200 | 11.0 | sagemaker-xgboost-230712-0449-001-bddad2de | Completed | 6130.155762 | 2023-07-12 04:50:43+00:00 | 2023-07-12 04:52:35+00:00 | 112.0 |
# deploy the best model found by the hyperparameter tuning job
predictor = tuner.deploy(initial_instance_count = 1, instance_type = "ml.m4.xlarge")
2023-07-12 04:56:36 Starting - Preparing the instances for training 2023-07-12 04:56:36 Downloading - Downloading input data 2023-07-12 04:56:36 Training - Training image download completed. Training in progress. 2023-07-12 04:56:36 Uploading - Uploading generated training model 2023-07-12 04:56:36 Completed - Training job completed
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-07-12-05-11-10-095
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-230712-0449-002-749d0328 INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-230712-0449-002-749d0328
--------!
# obtain predictions for test data from the optimised model's endpoint
predictor.serializer = CSVSerializer()
predictions = predictor.predict(X_test)
predictions
b'34032.72265625,31580.109375,11508.255859375,25243.3515625,19214.154296875,25387.94921875,30698.2578125,27807.458984375,25387.94921875,26328.3671875,19214.154296875,15356.6298828125,29849.06640625,45507.7265625,21479.681640625,11508.255859375,130988.34375,32051.541015625,19522.494140625,40379.48046875,35880.30859375,23644.939453125,21501.56640625,29237.32421875,46282.82421875,19214.154296875,16456.95703125,22176.21875,76415.90625,23881.291015625,38966.16015625,20294.609375'
# convert the endpoint's comma-separated byte payload to a column vector
# (redefined here so this section of the notebook runs standalone)
def bytes_2_array(x):
    """Decode a CSV byte string of predictions into an (n, 1) float32 array.

    The original version sliced str(x) to strip the b'...' repr wrapper,
    which is fragile; decoding the bytes directly is both simpler and safe.
    """
    fields = x.decode('utf-8').split(',')
    return np.array(fields, dtype = 'float32').reshape(-1, 1)
# decode the CSV byte payload into a column vector of predicted MSRPs
predicted_values = bytes_2_array(predictions)
predicted_values
array([[ 34032.723],
[ 31580.11 ],
[ 11508.256],
[ 25243.352],
[ 19214.154],
[ 25387.95 ],
[ 30698.258],
[ 27807.459],
[ 25387.95 ],
[ 26328.367],
[ 19214.154],
[ 15356.63 ],
[ 29849.066],
[ 45507.727],
[ 21479.682],
[ 11508.256],
[130988.34 ],
[ 32051.541],
[ 19522.494],
[ 40379.48 ],
[ 35880.31 ],
[ 23644.94 ],
[ 21501.566],
[ 29237.324],
[ 46282.824],
[ 19214.154],
[ 16456.957],
[ 22176.219],
[ 76415.91 ],
[ 23881.291],
[ 38966.16 ],
[ 20294.61 ]], dtype=float32)
# collect KPI metrics by comparing predicted_values for test data to actual y_test values
k = X_test.shape[1]   # number of predictors
n = len(X_test)       # number of test samples
MSE = mean_squared_error(y_test, predicted_values)
RMSE = round(float(np.sqrt(MSE)), 3)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
# adjusted R2 is only defined for n > k + 1; with ~32 test rows and 483
# features the unguarded formula produced a meaningless value > 1
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1) if n > k + 1 else float('nan')
print('RMSE =', RMSE, '\nMSE =', MSE, '\nMAE =', MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)
RMSE = 3314.265 MSE = 10984354.0 MAE = 2467.288 R2 = 0.974370474635362 Adjusted R2 = 1.001757777182088
# delete the endpoint to stop incurring hosting charges
predictor.delete_endpoint()
INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-230712-0449-002-749d0328 INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-230712-0449-002-749d0328